# -*- coding: utf-8 -*-
"""
Created on Sat Jul 30 10:35:40 2022

@author: Wieczorek_W_Station
"""

##load files
import codecs
import os
import re

import pandas as pd
from bs4 import BeautifulSoup as bs
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Candidate encodings for the XML sources; in this file the name is only
# referenced from commented-out code, the loaders below read UTF-8.
types_of_encoding = ["utf8", "cp1252"]
#%%
## Create the dictionaries.
## Download the necessary data into the respective paths

root = "C:\\Users\\Wieczorek_W_Station\\Dropbox\\Andere Aufgaben\\Malzahn\\"
path = root + "tarim-brahmi_database\\"
dictionaries = path + "dictionaries\\"
fragments = path + "fragments\\"
output = root + "word_corpus_data\\"

# exist_ok=True makes re-running the script harmless when the folder already
# exists, while genuine failures (missing drive, no write permission) are
# raised here instead of being silently swallowed and resurfacing later.
os.makedirs(output, exist_ok=True)


os.chdir(dictionaries)
#%%
# =============================================================================
# define functions
# =============================================================================
def check_nonetype(i):
    """Return the text content of a parsed element, or "" if it is missing.

    Parameters
    ----------
    i : object or None
        Result of a lookup such as ``tag.find(...)``; anything that is not
        ``None`` must provide a ``getText()`` method.

    Returns
    -------
    str
        ``i.getText()``, or the empty string when ``i`` is ``None``.
    """
    # Identity check instead of matching "NoneType" against the type's name:
    # the string match also fired for any class whose name merely contains
    # "NoneType".
    if i is None:
        return ""
    return i.getText()


#%%
# =============================================================================
# load and convert tocharian dictionaries into readable excel files 
# =============================================================================

os.chdir(dictionaries)
print(os.listdir())

# Column order must match the tuples returned by _noun_row(). Note that the
# "alternative_other_tocharian" column is filled with the "ref" attribute of
# the <etym> element and "pos_class" with the "val" attribute of <pos>.
colnames = ["lemma",
            "meaning",
            "keywords",
            "language",
            "pos_class",
            "alternative_other_tocharian",
            "loan_source"]


def _noun_row(noun, lang):
    """Build one output row (ordered like ``colnames``) from a <pos-n> element.

    Parameters
    ----------
    noun : bs4 Tag
        A ``pos-n`` element of a dictionary file.
    lang : str
        Language label of the dictionary file ("TA" or "TB").

    Returns
    -------
    tuple
        (lemma, meaning, keywords, language, pos value, etym ref, loan source).
        Missing <meaning>/<pos>/<etym> elements yield empty strings instead of
        raising; a noun without an etym "lang" counts as "Tocharian".
    """
    meaning = check_nonetype(noun.find("meaning"))  ## description
    ## additional keywords for explanation, space separated
    keywords = " ".join(k.get_text() for k in noun.find_all("keyword"))

    pos = noun.find("pos")
    pos_val = "" if pos is None else pos.get("val")  ## pos-tag

    lemma = noun.get("lemma")

    etym = noun.find("etym")
    if etym is None:
        etym_ref = ""
        loan_source = "Tocharian"
    else:
        etym_ref = etym.get("ref", "")
        loan_source = etym.get("lang", "Tocharian")

    return (lemma, meaning, keywords, lang, pos_val, etym_ref, loan_source)


def _load_noun_rows(filenames):
    """Parse every dictionary file and return the rows of all its nouns.

    ``filenames`` are resolved against the current working directory.
    Entries that are not regular files (e.g. sub-folders) are skipped.
    Working inside a function means all intermediate objects (file contents,
    parsed soup, ...) are released on return, so no manual ``del`` is needed.
    """
    rows = []
    for filename in filenames:
        if not os.path.isfile(filename):
            continue

        ## the language is encoded in the file name ("xto" = Tocharian A)
        lang = "TA" if "xto" in filename else "TB"

        ## undecodable bytes are replaced rather than aborting the run
        with codecs.open(filename, encoding="utf-8", errors="replace") as f:
            soup = bs(f.read(), "xml")

        nouns = soup.find_all("pos-n")
        total = len(nouns)
        for position, noun in enumerate(nouns, start=1):
            print("currently at ", position, "of total nouns:", total)
            rows.append(_noun_row(noun, lang))
    return rows


noun_df = pd.DataFrame(_load_noun_rows(os.listdir()), columns=colnames)

## save data (the working directory stays at `output` for the next cell)
os.chdir(output)
noun_df.to_excel("nouns_tocharian_a_and_b.xlsx")
#%%
# =============================================================================
# no manual memory clean-up required: the loop variables lived inside
# _load_noun_rows() and are gone once it has returned
# =============================================================================

#%%
# =============================================================================
# select names and group nouns by type
# =============================================================================

# Flag the nouns whose keyword string begins with "personal name"; the helper
# column "select" stays on noun_df, the filtered copy drops it again.
# NOTE(review): re.match only matches at the start of the keyword string, so
# entries listing "personal name" after another keyword are not selected --
# confirm that this is intended.
noun_df["select"] = [int(bool(re.match("personal name", kw)))
                     for kw in noun_df.keywords]
df = noun_df.loc[noun_df["select"] == 1].drop(columns="select")
df["meaning"] = [text.lower() for text in df["meaning"]]

## keywords searched for in the (lower-cased) meaning
typelist = [
    "clan",
    "god", "divine",
    "buddha", "boddha", "bodhisattva",
    "spirit", "ghost", "gandharva",  "world-guardian", "asura",
    "ruler", "king", "princess", "prince", "queen", "lady",
    "administrative", "prefekt", "official", "officer",
    "monastic",  "monk", "donor", "nun",
    "caravan", "commercial", "economic", "merchant",
    "brahmin",
    "colophon", "wooden tablet",
]

# One 0/1 indicator column per keyword. The keyword is used as a regular
# expression and may match inside longer words (e.g. "king" in "looking").
for keyword in typelist:
    df[keyword] = [0 if re.search(keyword, text) is None else 1
                   for text in df["meaning"]]


## define groups: (group column, keyword columns it summarises)

exchange = [
    ("gods", ["god", "divine"]),
    ("budda", ["buddha", "boddha", "bodhisattva"]),
    ("ghosts_spirits", ["spirit", "ghost", "gandharva",  "world-guardian", "asura"]),
    ("rulers", ["ruler", "king", "princess", "prince", "queen", "lady"]),
    ("administrative", ["administrative", "prefekt", "official", "officer"]),
    ("monastary_religion", ["monastic",  "monk", "donor", "nun"]),
    ("economic", ["caravan", "commercial", "economic", "merchant"]),
]

# A group column is 1 as soon as any of its keyword columns is 1. The groups
# "administrative" and "economic" share their name with a keyword column and
# therefore overwrite that column with the group-level flag.
for group in exchange:
    print(group)
    label, members = group
    hits_per_row = df[members].sum(axis=1)
    df[label] = [int(hits > 0) for hits in hits_per_row]

## export (written to the current working directory)

df.to_excel("nouns_tocharian_a_and_b_classified.xlsx")

#%%

#%%

# =============================================================================
# load and convert tocharian text fragments into a readable excel file
# =============================================================================

os.chdir(fragments)
# print(os.listdir())

# Column order must match the tuples returned by _fragment_row().
column_names = ["idno",
                "language",
                "title",
                "genre",
                "subgenre",
                "prose",
                "verse",
                "material",
                "medium",
                "region",
                "source_text",
                "prim_trans"]

# langdetect is non-deterministic unless seeded; a fixed seed keeps the
# English filter below reproducible from run to run.
DetectorFactory.seed = 0

# Raw strings: the patterns are unchanged, but the former non-raw literals
# relied on invalid escape sequences such as "\." which Python warns about.
## translation segments consisting only of dots / slashes (lacuna markers)
_PLACEHOLDER_SEGMENT = re.compile(r"[/]*[ ]*\.+[ ]*[/]*$")
## translation segments starting with a bracketed number
_NUMBERED_SEGMENT = re.compile(r"^[/]*[ ]* [\[|\)][0-9]+")
## markup removed from the joined translation
## NOTE(review): "\|.+\|" is greedy and removes everything between the first
## and the last "|" of the joined text -- confirm that this is intended.
_EDITORIAL_MARKS = re.compile(r"(\|.+\||[0-9]+\.|\[|\]|\(?\)|\(|\?\)|///)")
## NOTE(review): strips exactly one leading character of the source text;
## a "+" quantifier may have been intended -- behaviour kept as it was.
_LEADING_CHARACTER = re.compile("^[A-Za-z 0-9]")


def _find_in(parent, name, **kwargs):
    """Return ``parent.find(name, **kwargs)``, or None when *parent* is None.

    Lets lookups be chained without checking every intermediate element.
    """
    if parent is None:
        return None
    return parent.find(name, **kwargs)


def _is_english(segment):
    """Return True when langdetect classifies *segment* as English.

    Segments langdetect cannot classify (it raises LangDetectException, e.g.
    for text without usable features) are treated as not English.
    """
    try:
        return detect(segment) == "en"
    except LangDetectException:
        return False


def _fragment_language(profile_desc):
    """Map the "ident" of <language> to "TA" (for "xto"), "TB" or ""."""
    language = _find_in(profile_desc, "language")
    if language is None:
        return ""
    ident = language.get("ident", "")
    if ident == "xto":
        return "TA"
    if ident == "":
        return ""
    return "TB"


def _fragment_genres(profile_desc):
    """Return (genre, subgenre, prose, verse) read from <keywords>.

    ``prose`` and ``verse`` are 0/1 flags; everything is empty / 0 when the
    profile description or its <keywords> element is missing.
    """
    keywords = _find_in(profile_desc, "keywords")
    if keywords is None:
        return "", "", 0, 0

    genre = check_nonetype(keywords.find("term", attrs={"type": "genre"}))
    subgenre = check_nonetype(keywords.find("term", attrs={"type": "subgenre"}))
    styles = [term.get_text()
              for term in keywords.find_all("term", attrs={"type": "prose_verse"})]
    return genre, subgenre, int("prose" in styles), int("verse" in styles)


def _fragment_translation(text):
    """Return the cleaned English translation contained in a <text> element.

    Collects all ``seg`` elements of type "prim_transl", drops placeholder
    and numbered segments as well as non-English ones, joins the rest with
    spaces and removes editorial marks.
    """
    segments = [seg.get_text()
                for seg in text.find_all("seg", attrs={"type": "prim_transl"})]
    kept = [seg for seg in segments
            if not _PLACEHOLDER_SEGMENT.match(seg)
            and not _NUMBERED_SEGMENT.match(seg)
            and _is_english(seg)]
    return _EDITORIAL_MARKS.sub("", " ".join(kept))


def _fragment_row(soup):
    """Build one output row (ordered like ``column_names``) from a parsed file.

    Elements that are absent from the file (including the whole teiHeader)
    result in empty values instead of an exception.
    """
    # =========================================================================
    # first part: go through header
    # =========================================================================
    header = soup.find("teiHeader")
    file_desc = _find_in(header, "fileDesc")
    profile_desc = _find_in(header, "profileDesc")  # genre and language

    idno = check_nonetype(_find_in(_find_in(file_desc, "msIdentifier"), "idno"))
    lang = _fragment_language(profile_desc)
    title = check_nonetype(_find_in(_find_in(header, "msItem"), "title"))
    genre, subgenre, prose, verse = _fragment_genres(profile_desc)

    ## finding spot and material type
    region = check_nonetype(_find_in(header, "region"))
    material = check_nonetype(_find_in(header, "material"))

    ## writing medium ("medium" attribute of <handNote>)
    hand_note = _find_in(header, "handNote")
    if hand_note is None:
        medium = ""
    else:
        medium = hand_note.get_attribute_list("medium")[0]

    # =========================================================================
    # second part: go through sourceDoc
    # =========================================================================
    source_doc = soup.find("sourceDoc")
    if source_doc is None:
        # NOTE(review): as before, the translation is only read when a
        # <sourceDoc> exists -- confirm that this coupling is intended.
        source_text = ""
        prim_trans = ""
    else:
        source_text = " ".join(zone.get_text()
                               for zone in source_doc.find_all("zone"))
        source_text = _LEADING_CHARACTER.sub("", source_text)

        # =====================================================================
        # third part: go through text, get translation (where possible)
        # =====================================================================
        text = soup.find("text")
        prim_trans = "" if text is None else _fragment_translation(text)

    return (idno,
            lang,
            title,
            genre,
            subgenre,
            prose,
            verse,
            material,
            medium,
            region,
            source_text,
            prim_trans)


rows = []
files = os.listdir()

for file in files:
    ## skip sub-folders and other entries that cannot be opened as a file
    if not os.path.isfile(file):
        continue

    ## undecodable bytes are replaced rather than aborting the run
    with codecs.open(file, encoding="utf-8", errors="replace") as f:
        soup = bs(f.read(), "xml")

    print("currently at data:", file)
    rows.append(_fragment_row(soup))


tocharian_df = pd.DataFrame(rows, columns = column_names)
os.chdir(output)
tocharian_df.to_excel("tocharian_text_fragments.xlsx")
    